import fitz  # PyMuPDF
import cv2
import tabula
import numpy as np

# Render every page of a PDF to a PNG file and, optionally, outline the
# tables that Tabula detects on each page.

# Read the PDF file.
p = '/home/FAST_DATA_MIRROR/Langchain-Chatchat-master/云鹰平台服务系统接口协议V1.8.4.pdf'

# Set to True to outline every table Tabula detects on the saved page images.
# Off by default: the drawing call was previously disabled, so the saved
# images are plain page renders. Skipping the Tabula lookup when nothing is
# drawn also avoids one unused table extraction per page.
DRAW_TABLE_BOXES = False

# Open the document in a context manager so it is closed even if a page fails.
with fitz.open(p) as doc:
    for i in range(len(doc)):
        # Convert the PDF page to an image.
        pix = doc.get_page_pixmap(i)
        # pix.n is the number of components per pixel; using it instead of a
        # hard-coded 3 keeps the reshape valid if the pixmap carries alpha.
        img = np.frombuffer(pix.samples, np.uint8).reshape(pix.h, pix.w, pix.n)
        # The pixmap is rendered as RGB by default, while cv2.imwrite expects
        # BGR, so the conversion direction is RGB -> BGR. This also yields a
        # fresh, writable array (np.frombuffer returns a read-only view).
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        if DRAW_TABLE_BOXES:
            # Locate tables with Tabula; its page numbers are 1-based.
            tables = tabula.read_pdf(p, pages=i + 1, multiple_tables=True, stream=True, output_format='json')

            for table in tables:
                # Table bounding box: 'left' is the horizontal offset (x) and
                # 'top' is the vertical offset (y).
                x, y, w, h = table['left'], table['top'], table['width'], table['height']
                # NOTE(review): assumes Tabula's coordinates (PDF points) line
                # up with the pixmap's pixel grid at the default render
                # resolution -- confirm before relying on the boxes.
                cv2.rectangle(img, (int(x), int(y)), (int(x + w), int(y + h)), (0, 255, 0), 2)

        # Save the image.
        cv2.imwrite(f'/home/FAST_DATA_MIRROR/Langchain-Chatchat-master/pdf_tools/output/img/page_{i}.png', img)